Online e-commerce websites like Amazon and Flipkart use different recommendation models to provide different suggestions to different users. Amazon currently uses item-to-item collaborative filtering, which scales to massive data sets and produces high-quality recommendations in real time.
Amazon Reviews data: For this case study, we are using the Electronics dataset.
E-commerce
● userId : Every user identified with a unique id
● productId : Every product identified with a unique id
● Rating : Rating of the corresponding product by the corresponding user
● timestamp : Time of the rating (ignore this column for this exercise)
#import necessary libraries
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
#importing numerical library
import numpy as np
#To handle data in the form of rows and columns
import pandas as pd
#To enable plotting graphs in jupyter notebook
import matplotlib.pyplot as plt
%matplotlib inline
#Importing library for statistical graphs
import seaborn as sns
#Importing sklearn function for splitting dataset into training and test set
from sklearn.model_selection import train_test_split
#Statistical library
from scipy.stats import norm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import math
import json
import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.externals import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
import warnings; warnings.simplefilter('ignore')
#importing Surprise library for collaborative filtering
from surprise import Dataset
#Importing KNN,SVD,SVDpp,SlopeOne,NMF,Normalpresdictor,KNNBaseline,KNNBasic,KNNWithMeans,KNNWithZcore
#,BaselineOnly , Coclustering, cross_validate algorithm from surprise library
#from surprise import KNNWithMeans
#from surprise import SVD
#from surprise import SVDpp
#from surprise import SlopeOne
#from surprise import NMF
#from surprise import NormalPredictor
#from surprise import KNNBaseline
#from surprise import KNNBasic
#from surprise import KNNWithMeans
#from surprise import KNNWithZScore
#from surprise import BaselineOnly
#from surprise import CoClustering
#from surprise.model_selection import cross_validate
#The surprise.accuracy module provides tools for computing accuracy metrics on a set of predictions.
from surprise import accuracy
#sklearn for feature extraction & modeling
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# Load the ratings CSV with pandas; the raw file has no header row,
# so the column names are supplied explicitly.
column_names = ['user_id', 'product_id', 'ratings', 'Timestamp']
original_df = pd.read_csv('ratings_Electronics (1).csv', names=column_names)
# Display (rows, columns) of the raw data
original_df.shape
Originally, the dataset does not have any column names, so I have supplied them via the `names` argument of pandas' read_csv function. There are 7,824,482 rows and 4 columns.
# Preview the first five records of the dataset
original_df.head(5)
# Column dtypes and non-null counts for each column
original_df.info()
# Summary statistics of the 'ratings' column
original_df['ratings'].describe().transpose()
# Drop the 'Timestamp' column: it is irrelevant for our recommender system
new_df = original_df.drop(columns=['Timestamp'])
new_df.head()
# Check each column for missing values
new_df.isna().sum()
Description - No null values were found.
# Find the minimum and maximum ratings
print('The minimum rating is: %d' %(new_df['ratings'].min()))
print('The maximum rating is: %d' %(new_df['ratings'].max()))
# Check the distribution of ratings.
# FIX 1: the body of the `with` block was not indented (SyntaxError).
# FIX 2: seaborn renamed `factorplot` to `catplot` in 0.9 and removed
# the old name in later releases; the data column is passed as `x=`.
with sns.axes_style('white'):
    g = sns.catplot(x="ratings", data=new_df, aspect=2.0, kind='count')
    g.set_ylabels("Total number of ratings")
**Description** - The above graph shows the distribution of rating values, i.e. the count of ratings for each rating level.
# Count the distinct user ids and product ids in the data
n_unique_users = new_df['user_id'].nunique()
n_unique_items = new_df['product_id'].nunique()
print('Number of unique USERS in Raw data = ', n_unique_users)
print('Number of unique ITEMS in Raw data = ', n_unique_items)
# Display the unique-product count on its own as well
new_df['product_id'].nunique()
# Interactive bar chart of the overall rating distribution (plotly)
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# Counts per rating value, highest rating first
rating_counts = new_df['ratings'].value_counts().sort_index(ascending=False)
total_ratings = new_df.shape[0]
# Percentage label for each bar
percent_labels = ['{:.1f} %'.format(val)
                  for val in (rating_counts.values / total_ratings * 100)]

trace = go.Bar(
    x=rating_counts.index,
    y=rating_counts.values,
    text=percent_labels,
    textposition='auto',
    textfont=dict(color='#000000'),
)
# Create layout
layout = dict(
    title='Distribution Of {} Product-ratings'.format(total_ratings),
    xaxis=dict(title='Rating'),
    yaxis=dict(title='Count'),
)
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
# Number of ratings per product, clipped at 50 so the long tail of
# heavily-rated products does not dominate the histogram
data = new_df.groupby('product_id')['ratings'].count().clip(upper=50)
# Create trace: histogram binned in steps of 2 over [0, 50]
trace = go.Histogram(x = data.values,
                     name = 'ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
# FIX: the title said "Clipped at 100" but the data is clipped at 50
# (and the bins end at 50) — corrected to match.
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Product (Clipped at 50)',
                   xaxis = dict(title = 'Number of Ratings Per Product'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
# Top 10 products by number of ratings received
new_df.groupby('product_id')['ratings'].count().reset_index().sort_values('ratings', ascending=False)[:10]
# Ratings-per-user distribution, clipped at 50 to keep the histogram readable
data = new_df.groupby('user_id')['ratings'].count().clip(upper=50)

# Histogram trace binned in steps of 2 over [0, 50]
trace = go.Histogram(
    x=data.values,
    name='Ratings',
    xbins=dict(start=0, end=50, size=2),
)
# Figure layout
layout = go.Layout(
    title='Distribution Of Number of Ratings Per User (Clipped at 50)',
    xaxis=dict(title='Ratings Per User'),
    yaxis=dict(title='Count'),
    bargap=0.2,
)
# Render the plot inline
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)